library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(nlme)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(ggplot2)
library(dplyr)
library(sjPlot) #for plotting lmer and glmer mods
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(groupdata2)
library(hydroGOF) # rmse()
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
mydata <- read_csv("covid.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## name = col_character(),
## state = col_character(),
## region = col_character(),
## govParty = col_character(),
## metro_area = col_character(),
## main_econ = col_character()
## )
## See spec(...) for full column specifications.
mydata <- mydata %>%
select(-c(1:2,29:31))
mydata <- mydata[is.finite(mydata$covid_death_rate_log),]
mydata$name <- as.factor(mydata$name)
mydata$region <- as.factor(mydata$region)
mydata$state <- as.factor(mydata$state)
mydata$govParty <- as.factor(mydata$govParty)
mydata$metro_area <- as.factor(mydata$metro_area)
mydata$main_econ <- as.factor(mydata$main_econ)
mydata$GQ_ESTIMATES_2019 <- log(mydata$GQ_ESTIMATES_2019)
mydata <- mydata %>%
filter(GQ_ESTIMATES_2019 != '-Inf') %>%
mutate(density = total_population/area_sqmi)
head(mydata)
#dim(mydata)
#death_rate <- exp(mydata$covid_death_rate_log)*100
#summary(death_rate)
summary(mydata)
summary(mydata$name)
\(name\): county demographic factors: \(total\_population\), \(percent\_fair\_or\_poor\_health\), \(per\_capita\_income\) social factors: \(percent\_uninsured\), \(percent\_minorities\), \(percent\_below\_poverty\), \(percentile\_rank\_social\_vulnerability\) geographical factors: \(region\) and \(state\) economic factors: \(main\_econ\) and \(metro\_area\) political factors: \(pro\_Trump\) and \(govParty\)
\(GQ\_ESTIMATES\_2019\): the total number of county residents living in group quarters in 2019 (e.g. nursing homes, residential treatment centers, student housing, religious convents/monasteries, correctional facilities, and hospitals)
ggplot(mydata, aes(x=covid_death_rate_log)) +
geom_histogram(fill = "cornflowerblue", color = "white") +
facet_wrap(~region, ncol = 4) +
labs(title = "Death Rates by Region")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata, aes(x=covid_death_rate_log)) +
geom_histogram(fill = "cornflowerblue", color = "white") +
facet_wrap(~metro_area, ncol = 3) +
labs(title = "Death Rates by Metro Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata, aes(x=covid_death_rate_log)) +
geom_histogram(fill = "cornflowerblue", color = "white") +
facet_wrap(~govParty, ncol = 2) +
labs(title = "Death Rates by govParty")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mydata, aes(x = region, y = covid_death_rate_log)) +
geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) +
labs(title = "Death Rates by Region")
ggplot(mydata, aes(x = metro_area, y = covid_death_rate_log)) +
geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) +
labs(title = "Death Rates by Metro Area")
ggplot(mydata, aes(x = govParty, y = covid_death_rate_log)) +
geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) +
labs(title = "Death Rates by govParty")
ggplot(mydata, aes(x = main_econ, y = covid_death_rate_log)) +
geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) +
labs(title = "Death Rates by Main Economy")
Northeast seems to have higher log(death rates). metro_area, govParty, main_econ do not seem to have significant impacts on death rates.
ggplot(mydata, aes(x = GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = per_capita_income, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = proTrump, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = percent_below_poverty, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = percent_fair_or_poor_health, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = percent_minorities, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = percent_uninsured, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mydata, aes(x = percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Positive association between group quarters and log(death rates) Negative association between proTrump and log(death rates). Positive association between percent_below_poverty and log(death rates) Positive association between percent_fair_or_poor_health and log(death rates) Positive association between percent_minorities and log(death rates) Positive association between percentile_rank_social_vulnerability and log(death rates) No obvious relationship between log(death rates) and percent_uninsured and per capita income
# subset counties -> Columbia in various states
plotdata <- mydata %>%
filter(name == "Columbia")
# basic Cleveland plot
ggplot(plotdata, aes(x=name, y=covid_death_rate_log)) +
geom_point()
# subset counties -> Franklin in various states
plotdata <- mydata %>%
filter(name == "Franklin")
# basic Cleveland plot
ggplot(plotdata, aes(x=name, y=covid_death_rate_log)) +
geom_point()
# plot death rates by group quarters, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by percent below poverty, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by proTrump, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by percent_fair_or_poor_health, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by percent_minorities, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by percentile_rank_social_vulnerability, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by percent_uninsured, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
# plot death rates by per capita income, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~region) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~metro_area) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~main_econ) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
geom_smooth(se=FALSE, method="lm", color="grey") +
geom_point(color="blue") +
facet_wrap(~govParty) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in Death Rates")
dotchart(sort(xtabs(~ mydata$name)), cex=0.7)
## Warning in dotchart(sort(xtabs(~mydata$name)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)
dotchart(sort(xtabs(~ mydata$state)), cex=0.7)
## Warning in dotchart(sort(xtabs(~mydata$state)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)
dotchart(sort(xtabs(~ mydata$region)), cex=0.7)
## Warning in dotchart(sort(xtabs(~mydata$region)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)
Distribution of potential random effects variables. There are many \(name\) (county), each only contribute one instances). There's many states, with a pretty skewed distribution -> the state effects can be worrying. Finally, there's only few 4 regions.
model1 <- lmer(covid_death_rate_log ~ (1|name) + (1|state) + (1|region), data=mydata)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0173038 (tol = 0.002, component 1)
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ (1 | name) + (1 | state) + (1 | region)
## Data: mydata
##
## REML criterion at convergence: 7666.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.6748 -0.4441 0.0924 0.5982 3.8124
##
## Random effects:
## Groups Name Variance Std.Dev.
## name (Intercept) 0.0007534 0.02745
## state (Intercept) 0.2034087 0.45101
## region (Intercept) 0.2351881 0.48496
## Residual 0.8376222 0.91522
## Number of obs: 2832, groups: name, 1682; state, 49; region, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.0512 0.2536 -15.98
## convergence code: 0
## Model failed to converge with max|grad| = 0.0173038 (tol = 0.002, component 1)
The estimated variances for the random effects. For \(name\), the optimal variance is 0.00123, which is very close to zero: the model gains nothing by attributing the response to county variation. So \(name\) isn’t a significant effect in the model. Hence we can drop the random effect for \(name\). But for \(state\) and \(region\), the choice of state and region introduce a larger variance.
model2 <- lmer(covid_death_rate_log ~ (1|state) + (1|region), data=mydata)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ (1 | state) + (1 | region)
## Data: mydata
##
## REML criterion at convergence: 7666.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.6754 -0.4459 0.0921 0.5980 3.8140
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.2034 0.4510
## region (Intercept) 0.2327 0.4823
## Residual 0.8384 0.9156
## Number of obs: 2832, groups: state, 49; region, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.0513 0.2523 -16.06
Dropping random effect for \(name\) makes almost no difference to the model fit.
model3 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + main_econ + metro_area + (1|state) + (1|region), data=mydata)
model4 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + main_econ + (1|state) + (1|region), data=mydata)
model5 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + metro_area + (1|state) + (1|region), data=mydata)
anova(model3, model4)
anova(model3, model5)
metro_area and main_econ significant
model6 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
anova(model3, model6)
govParty not significant
model7 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + main_econ + metro_area + (1|state) + (1|region), data=mydata)
anova(model6, model7)
percentile_rank_social_vulnerability significant at 0.05 threshold
model8 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
anova(model6, model8)
percent_uninsured insignificant
model9 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
model10 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
anova(model8, model9)
anova(model8, model10)
percent_minorities significant at 0.05 threshold percent_fair_or_poor_health insignificant
model11 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
model12 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
model13 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
model14 <- lmer(covid_death_rate_log ~ per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
anova(model10, model11)
anova(model11, model12)
anova(model12, model13)
anova(model13, model14)
percent_below_poverty and proTrump insignificant per_capita_income and GQ_ESTIMATES_2019 very significant
model15 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
summary(model15)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +
## percent_minorities + percentile_rank_social_vulnerability +
## main_econ + metro_area + (1 | state) + (1 | region)
## Data: mydata
##
## REML criterion at convergence: 7632.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2222 -0.4600 0.0812 0.5804 4.0108
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.1430 0.3782
## region (Intercept) 0.1988 0.4459
## Residual 0.8146 0.9025
## Number of obs: 2832, groups: state, 49; region, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.785e+00 2.914e-01 -16.419
## GQ_ESTIMATES_2019 -3.329e-02 1.431e-02 -2.327
## per_capita_income 2.348e-05 5.243e-06 4.478
## percent_minorities 4.725e-03 1.440e-03 3.281
## percentile_rank_social_vulnerability 4.197e-01 1.321e-01 3.177
## main_econgovernment -8.718e-02 7.949e-02 -1.097
## main_econmanufacturing 7.835e-02 7.291e-02 1.075
## main_econmining 5.928e-02 8.895e-02 0.666
## main_econnonspecialized 1.972e-01 6.621e-02 2.978
## main_econrecreation 1.119e-01 8.161e-02 1.371
## metro_areametro -2.048e-02 4.657e-02 -0.440
## metro_areanon-metro -2.021e-01 4.738e-02 -4.264
##
## Correlation of Fixed Effects:
## (Intr) GQ_EST pr_cp_ prcnt_ prc___ mn_cng mn_cnmnf mn_cnmnn mn_cnn
## GQ_ESTIMATE -0.115
## per_cpt_ncm -0.502 -0.232
## prcnt_mnrts 0.116 -0.042 -0.202
## prcntl_rn__ -0.395 -0.229 0.666 -0.517
## mn_cngvrnmn -0.065 -0.342 0.051 -0.034 -0.002
## mn_cnmnfctr -0.078 -0.162 -0.033 0.044 -0.075 0.609
## main_cnmnng -0.040 -0.096 -0.074 0.029 -0.073 0.458 0.458
## mn_cnnnspcl -0.059 -0.265 -0.043 0.040 -0.076 0.717 0.728 0.527
## man_cnrcrtn -0.076 -0.105 -0.092 0.068 -0.019 0.532 0.553 0.421 0.632
## metro_armtr 0.064 -0.236 -0.171 -0.135 0.145 0.015 0.023 0.029 -0.048
## mtr_rnn-mtr -0.107 0.089 0.002 -0.028 0.016 0.028 0.083 -0.028 0.049
## mn_cnr mtr_rm
## GQ_ESTIMATE
## per_cpt_ncm
## prcnt_mnrts
## prcntl_rn__
## mn_cngvrnmn
## mn_cnmnfctr
## main_cnmnng
## mn_cnnnspcl
## man_cnrcrtn
## metro_armtr 0.051
## mtr_rnn-mtr 0.008 0.348
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
model16 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region) + per_capita_income*metro_area, data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
model17 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region) + per_capita_income*metro_area + percent_minorities*per_capita_income, data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
anova(model16, model15)
## refitting model(s) with ML (instead of REML)
anova(model17, model16)
## refitting model(s) with ML (instead of REML)
final.model <- model17
summary(final.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +
## percent_minorities + percentile_rank_social_vulnerability +
## main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *
## metro_area + percent_minorities * per_capita_income
## Data: mydata
##
## REML criterion at convergence: 7693.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2456 -0.4479 0.0794 0.5721 4.0786
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.1387 0.3725
## region (Intercept) 0.1742 0.4174
## Residual 0.8124 0.9013
## Number of obs: 2832, groups: state, 49; region, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -5.054e+00 3.814e-01 -13.250
## GQ_ESTIMATES_2019 -3.157e-02 1.443e-02 -2.188
## per_capita_income 3.360e-05 1.127e-05 2.981
## percent_minorities 1.142e-02 3.736e-03 3.057
## percentile_rank_social_vulnerability 4.971e-01 1.380e-01 3.601
## main_econgovernment -1.002e-01 7.948e-02 -1.261
## main_econmanufacturing 7.343e-02 7.283e-02 1.008
## main_econmining 7.092e-02 8.891e-02 0.798
## main_econnonspecialized 1.842e-01 6.623e-02 2.781
## main_econrecreation 1.089e-01 8.216e-02 1.325
## metro_areametro -2.380e-01 2.143e-01 -1.110
## metro_areanon-metro 1.536e-01 2.306e-01 0.666
## per_capita_income:metro_areametro 8.657e-06 8.933e-06 0.969
## per_capita_income:metro_areanon-metro -1.536e-05 9.819e-06 -1.564
## per_capita_income:percent_minorities -3.538e-07 1.693e-07 -2.090
##
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
#print(final.model, corr=F)
covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +
percent_minorities + percentile_rank_social_vulnerability +
main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *
metro_area + percent_minorities * per_capita_income
sjPlot::tab_model(final.model, show.re.var= TRUE,
#pred.labels =c("(Intercept)", "per capita income:percent minorities"...),
dv.labels= "Effect of Socio-economic Factors on County COVID Death Rates")
| Effect of Socio-economic Factors on County COVID Death Rates | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -5.05 | -5.80 – -4.31 | <0.001 |
| GQ_ESTIMATES_2019 | -0.03 | -0.06 – -0.00 | 0.029 |
| per_capita_income | 0.00 | 0.00 – 0.00 | 0.003 |
| percent_minorities | 0.01 | 0.00 – 0.02 | 0.002 |
| percentile_rank_social_vulnerability | 0.50 | 0.23 – 0.77 | <0.001 |
| main_econ [government] | -0.10 | -0.26 – 0.06 | 0.207 |
| main_econ [manufacturing] | 0.07 | -0.07 – 0.22 | 0.313 |
| main_econ [mining] | 0.07 | -0.10 – 0.25 | 0.425 |
|
main_econ [nonspecialized] |
0.18 | 0.05 – 0.31 | 0.005 |
| main_econ [recreation] | 0.11 | -0.05 – 0.27 | 0.185 |
| metro_area [metro] | -0.24 | -0.66 – 0.18 | 0.267 |
| metro_area [non-metro] | 0.15 | -0.30 – 0.61 | 0.505 |
|
per_capita_income * metro_area [metro] |
0.00 | -0.00 – 0.00 | 0.332 |
|
per_capita_income * metro_area [non-metro] |
-0.00 | -0.00 – 0.00 | 0.118 |
|
per_capita_income * percent_minorities |
-0.00 | -0.00 – -0.00 | 0.037 |
| Random Effects | |||
| σ2 | 0.81 | ||
| τ00 state | 0.14 | ||
| τ00 region | 0.17 | ||
| ICC | 0.28 | ||
| N state | 49 | ||
| N region | 4 | ||
| Observations | 2832 | ||
| Marginal R2 / Conditional R2 | 0.041 / 0.308 | ||
Effects are pretty much all significant: GQ_ESTIMATES_2019, per_capita_income, percent_minorities, percentile_rank_social_vulnerability (positive), main_econ [nonspecialized] (baseline: farming), per_capita_income*percent_minorities
#plot for fixed effects
#axis labels should be in order from bottom to top
#To see the values of the effect size and p-value, set show.values and show.p= TRUE.
#P-values will only be shown if the effect size values are too
sjPlot::plot_model(final.model,
#axis.labels=c("per capita income:percent minorities"...),
show.values=TRUE, show.p=TRUE,
title="Effect of Socio-economic Factors on County COVID Death Rates")
plot(final.model, type=c("p", smooth))
qqnorm(residuals(final.model))
ggplot(data.frame(lev=hatvalues(final.model),
pearson=residuals(final.model, type="pearson")),
aes(x=lev, y=pearson)) +
geom_point() +
theme_bw()
Residaul plot looks good and no obvious heteroskedasticity. Normality is also good. Seems to have some high leverage points.
# scale-location plots
plot(final.model, sqrt(abs(resid(.))) ~ fitted(.), type = c("p", "smooth"))
#creating a training set
smp_size <- floor(0.8 * nrow(mydata))
set.seed(1)
train_ind <- sample(seq_len(nrow(mydata)), size = smp_size)
train <- mydata[train_ind, ]
test <- mydata[-train_ind, ]
#obtaining the mean-squared error for training set
pred.tr <- predict(final.model, train)
MSE.tr <- sum((pred.tr - train$covid_death_rate_log)^2)/nrow(train)
MSE.tr
## [1] 0.7978782
#obtaining the mean-squared error for test set
pred.te <- predict(final.model, test)
MSE.te <- sum((pred.te - test$covid_death_rate_log)^2)/nrow(test)
MSE.te
## [1] 0.7915444
#obtaining the mean-squared error for dataset
pred <- predict(final.model, mydata)
MSE <- sum((pred - mydata$covid_death_rate_log)^2)/nrow(mydata)
MSE
## [1] 0.7966101
#https://cran.r-project.org/web/packages/groupdata2/vignettes/cross-validation_with_groupdata2.html
set.seed(1)
train <- fold(train, k = 10)
train <- train %>% arrange(.folds) # order by .folds
crossvalidate <- function(data, k, model, dependent, random = FALSE){
# 'data' is the training set with the ".folds" column
# 'k' is the number of folds we have
# 'model' is a string describing a linear regression model formula
# 'dependent' is a string with the name of the score column we want to predict
# 'random' is a logical; do we have random effects in the model?
# Initialize empty list for recording performances
performances <- c()
# One iteration per fold
for (fold in 1:k){
# Create training set for this iteration
# Subset all the datapoints where .folds does not match the current fold
training_set <- data[data$.folds != fold,]
# Create test set for this iteration
# Subset all the datapoints where .folds matches the current fold
testing_set <- data[data$.folds == fold,]
## Train model
# If there is a random effect,
# use lmer() to train model
# else use lm()
if (isTRUE(random)){
# Train linear mixed effects model on training set
model <- lmer(model, training_set, REML=FALSE)
} else {
# Train linear model on training set
model <- lm(model, training_set)
}
## Test model
# Predict the dependent variable in the testing_set with the trained model
predicted <- predict(model, testing_set, allow.new.levels=TRUE)
# Get the Root Mean Square Error between the predicted and the observed
RMSE <- rmse(predicted, testing_set[[dependent]])
# Add the RMSE to the performance list
performances[fold] <- RMSE
}
# Return the mean of the recorded RMSEs
#return(performances)
#return(c('RMSE' = mean(performances)))
return(mean(performances))
}
#library(hydroGOF) # rmse()
set.seed(1)
RMSE.tr <- crossvalidate(train, k=10, final.model, dependent="covid_death_rate_log", random = TRUE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
MSE.tr <- RMSE.tr^2
MSE.tr
## [1] 0.8341988
MSE: 0.8341988
# Creating the model for the full training set
cv.model <- lmer(final.model, train, REML = FALSE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00292805 (tol = 0.002, component 1)
# Predict the dependent variable in the test_set with the trained model
predicted <- predict(cv.model, test, allow.new.levels=TRUE)
# Get the Root Mean Square Error between the predicted and the observed
RMSE.te <- rmse(predicted, test[["covid_death_rate_log"]])
MSE.te <- RMSE.te^2
MSE.te
## [1] 0.8273282
MSE: 0.8273282
# Summarize the results
summary(cv.model)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +
## percent_minorities + percentile_rank_social_vulnerability +
## main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *
## metro_area + percent_minorities * per_capita_income
## Data: train
##
## AIC BIC logLik deviance df.resid
## 6087.3 6190.4 -3025.7 6051.3 2247
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.1585 -0.4569 0.0798 0.5680 3.9468
##
## Random effects:
## Groups Name Variance Std.Dev.
## state (Intercept) 0.1262 0.3552
## region (Intercept) 0.0980 0.3131
## Residual 0.8102 0.9001
## Number of obs: 2265, groups: state, 49; region, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -5.089e+00 3.887e-01 -13.093
## GQ_ESTIMATES_2019 -2.276e-02 1.600e-02 -1.423
## per_capita_income 3.292e-05 1.262e-05 2.608
## percent_minorities 1.264e-02 4.253e-03 2.972
## percentile_rank_social_vulnerability 4.556e-01 1.531e-01 2.976
## main_econgovernment -6.087e-02 8.968e-02 -0.679
## main_econmanufacturing 5.506e-02 8.184e-02 0.673
## main_econmining 6.630e-02 9.929e-02 0.668
## main_econnonspecialized 1.891e-01 7.438e-02 2.542
## main_econrecreation 1.373e-01 9.162e-02 1.499
## metro_areametro -2.025e-01 2.383e-01 -0.850
## metro_areanon-metro 1.110e-01 2.612e-01 0.425
## per_capita_income:metro_areametro 7.980e-06 9.931e-06 0.804
## per_capita_income:metro_areanon-metro -1.323e-05 1.109e-05 -1.193
## per_capita_income:percent_minorities -4.110e-07 1.931e-07 -2.129
##
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## Model failed to converge with max|grad| = 0.00292805 (tol = 0.002, component 1)
sjPlot::tab_model(cv.model, show.re.var= TRUE,
#pred.labels =c("(Intercept)", "per capita income:percent minorities"...),
dv.labels= "Effect of Socio-economic Factors on County COVID Death Rates")
| Effect of Socio-economic Factors on County COVID Death Rates | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -5.09 | -5.85 – -4.33 | <0.001 |
| GQ_ESTIMATES_2019 | -0.02 | -0.05 – 0.01 | 0.155 |
| per_capita_income | 0.00 | 0.00 – 0.00 | 0.009 |
| percent_minorities | 0.01 | 0.00 – 0.02 | 0.003 |
| percentile_rank_social_vulnerability | 0.46 | 0.16 – 0.76 | 0.003 |
| main_econ [government] | -0.06 | -0.24 – 0.11 | 0.497 |
| main_econ [manufacturing] | 0.06 | -0.11 – 0.22 | 0.501 |
| main_econ [mining] | 0.07 | -0.13 – 0.26 | 0.504 |
|
main_econ [nonspecialized] |
0.19 | 0.04 – 0.33 | 0.011 |
| main_econ [recreation] | 0.14 | -0.04 – 0.32 | 0.134 |
| metro_area [metro] | -0.20 | -0.67 – 0.26 | 0.395 |
| metro_area [non-metro] | 0.11 | -0.40 – 0.62 | 0.671 |
|
per_capita_income * metro_area [metro] |
0.00 | -0.00 – 0.00 | 0.422 |
|
per_capita_income * metro_area [non-metro] |
-0.00 | -0.00 – 0.00 | 0.233 |
|
per_capita_income * percent_minorities |
-0.00 | -0.00 – -0.00 | 0.033 |
| Random Effects | |||
| σ2 | 0.81 | ||
| τ00 state | 0.13 | ||
| τ00 region | 0.10 | ||
| ICC | 0.22 | ||
| N state | 49 | ||
| N region | 4 | ||
| Observations | 2265 | ||
| Marginal R2 / Conditional R2 | 0.042 / 0.250 | ||
GQ_ESTIMATES_2019 no longer significant
# Predict the dependent variable in the full data with the trained model
predicted <- predict(cv.model, mydata, allow.new.levels=TRUE)
# Get the Root Mean Square Error between the predicted and the observed
RMSE <- rmse(predicted, mydata[["covid_death_rate_log"]])
MSE <- RMSE^2
MSE
## [1] 0.8023507
MSE: 0.8023507